require (plyr)
require(dplyr)
require(ggplot2)
require(openxlsx)
require(xtable)
require(stargazer)
require(psych)
require(GGally)
#Other alternatives for the NAs in age
Data_imp <- read.csv("Replication_package/Data_imp.csv")

##use mean values
Data_imp$agena<- ifelse(Data_imp$age==0, NA, Data_imp$age)
Data_imp$age_m<-ifelse(is.na(Data_imp$agena==TRUE), mean(Data_imp$agena, na.rm=TRUE), Data_imp$agena)

#use predicted values
#use a linear regression to predict age (with NAs)
lm.imp1<- lm(Data_imp$agena~ Data_imp$sum_co2+ Data_imp$sum_capacity + Data_imp$capacityWeight+ Data_imp$pwd+Data_imp$co2Weight, data = Data_imp)
#get prediction for data
pred.1 <- predict (lm.imp1)
#create a function to get a completed dataset by imputing the predictions into the missing values
impute <- function (a, a.impute){ ifelse (is.na(a), a.impute, a)
}
#create a new imputed age variabe that has no NAs
Data_imp$age.imp1<-impute(Data_imp$agena, pred.1)

#*******************
#same for NAs in Co2
Data_imp$co2na<- ifelse(Data_imp$sum_co2==0, NA, Data_imp$sum_co2)
Data_imp$co2_m<-ifelse(is.na(Data_imp$co2na==TRUE), mean(Data_imp$co2na, na.rm=TRUE), Data_imp$co2na)

#use predicted values
#use a linear regression to predict age (with NAs)
lm.imp2<- lm(Data_imp$co2na~ Data_imp$sum_capacity + Data_imp$capacityWeight+ Data_imp$pwd + Data_imp$agena, data = Data_imp)
#get prediction for data
pred.2 <- predict (lm.imp2)

#create a new imputed age variabe that has no NAs
Data_imp$co2.imp<-impute(Data_imp$co2na, pred.2)


#get main index and index with imputed values
Data_imp$age_std<-(Data_imp$age - mean(Data_imp$age, na.rm = T))/ sd(Data_imp$age, na.rm = T)
Data_imp$co2_std<-(Data_imp$sum_co2 - mean(Data_imp$sum_co2, na.rm = T))/ sd(Data_imp$sum_co2, na.rm = T)
Data_imp$pwd_std<-(Data_imp$pwd - mean(Data_imp$pwd))/ sd(Data_imp$pwd)
Data_imp$index = (1/3 * Data_imp$age_std)+ (1/3 * Data_imp$pwd_std)+ (1/3 * Data_imp$co2_std)

Data_imp$age_std_m<-(Data_imp$age_m - mean(Data_imp$age_m))/ sd(Data_imp$age_m)
Data_imp$co2_std_m<-(Data_imp$co2_m - mean(Data_imp$co2_m))/ sd(Data_imp$co2_m)
Data_imp$index_m = (1/3 * Data_imp$age_std_m)+ (1/3 * Data_imp$pwd_std)+ (1/3 * Data_imp$co2_std_m)

Data_imp$age_std_imp<-(Data_imp$age.imp1 - mean(Data_imp$age.imp1))/ sd(Data_imp$age.imp1)
Data_imp$co2_std_imp<-(Data_imp$co2.imp- mean(Data_imp$co2.imp))/ sd(Data_imp$co2.imp)
Data_imp$index_imp = (1/3 * Data_imp$age_std_imp)+ (1/3 * Data_imp$pwd_std)+ (1/3 * Data_imp$co2_std_imp)
#Get top 20 for main index 
top20<-top_n(Data_imp, 20, index)
write.csv(top20, "Replication_package/top_20_plot.csv")

#Get top 20 for each index
top20_m<-top_n(Data_imp, 20, index_m)
print(xtable(top20_m, type = "latex"), file = "top20_mean.tex")
countries_top20plant_m<- ddply(top20_m,.(country ),summarize,
                             Capacity=sum(sum_capacity), CO2= sum(sum_co2), Population_weighted_damage =sum(pwd, na.rm = T),Age= mean(age, na.rm = T), Population=sum(gpwv4popula, na.rm = T))
print(xtable(countries_top20plant_m, type = "latex"), file = "countries_top20plants_m.tex")
#
top20_imp<-top_n(Data_imp, 20, index_imp)
print(xtable(top20_imp, type = "latex"), file = "top20_imputed.tex")
countries_top20plant_imp<- ddply(top20_imp,.(country ),summarize,
                               Capacity=sum(sum_capacity), CO2= sum(sum_co2), Population_weighted_damage =sum(pwd, na.rm = T),Age= mean(age, na.rm = T), Population=sum(gpwv4popula, na.rm = T))
print(xtable(countries_top20plant_imp, type = "latex"), file = "countries_top20plants_imp.tex")

#----------------------------------
#Plot
gd_ind<- Data_imp[,c(22,25,28)]
names(gd_ind)[1]<- "Main index"
names(gd_ind)[2]<- "Index mean values"
names(gd_ind)[3]<- "Index predicted values"
ggpairs(gd_ind, axisLabels = "internal", lower = list(continuous = wrap(lowerFn, method = "lm")),  upper = list(continuous = wrap("cor", size = 4)))
